To assess the diversity within samples, we did a clustering based on distance (<0.10). Each cluster is summarized by a representative sequence (the longest).
library(dendextend)
library(Biostrings)
library(DECIPHER)
get_clusters_and_consensus=function(file, cutoff=0.1){
seqs=readDNAStringSet(file)
seqs=RemoveGaps(seqs)
alignement=AlignSeqs(seqs, processors = NULL)
d=DistanceMatrix(myXStringSet = alignement, correction = "Jukes-Cantor", processors = NULL, penalizeGapGapMatches=FALSE, penalizeGapLetterMatches=F)
pdf( file = paste0(file, ".pdf"))
clusters=IdClusters(d, cutoff = cutoff,showPlot = T, method = "NJ")
title(main = file)
dev.off()
n_clusters=max(clusters$cluster)
# get the longest of each cluster
representant_liste=list()
for (i in 1:n_clusters){
aln=alignement[which(x = clusters$cluster==i)]
n_sequ=length(aln)
if (n_sequ==1){
representant_liste[[i]]=aln[1]
}
if (n_sequ>1){
# get the length of each
length=apply(alphabetFrequency(aln)[,1:4], 1, sum)
# choose the longest
longest=which(length==max(length))[1]
#consensus_i=ConsensusSequence(aln)
representant_i=aln[longest]
representant_liste[[i]]=representant_i
}
# change name
base_name=lapply(strsplit(file, split="_"), FUN = function(x){
res=paste0(x[1], "_", x[2])})
names(representant_liste[[i]])=paste0(base_name,"_c", i, "_n=", n_sequ)
}
# save the representant per clusters
representants = unlist(representant_liste)
representants = do.call(c, representants)
writeXStringSet(x = reverseComplement(representants), filepath = paste0(file, "_representants.fa"))
# save the alignment ordered by cluster (reverse complement to be in the correct orientation)
read=c(1:dim(clusters)[1])
clusters=data.frame(read, clusters)
clusters=clusters[order(clusters$cluster, decreasing = F),]
alignement_ordered=alignement[clusters$read]
names(alignement_ordered)=paste0(clusters$read, "_c", clusters$cluster)
writeXStringSet(x = reverseComplement(alignement_ordered), filepath = paste0(file, "_ordered.aln"))
return(representant_liste)
}
###################################################################################################
# list of files to analyze
ls=list.files(path = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/", pattern = "*.fasta_short.fa$")
setwd(dir = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/")
consensus_per_cluster_liste=list()
for (i in 1:length(ls)){
print(paste0("sample number:", i," ", ls[i]))
consensus_per_cluster_liste[[i]]=get_clusters_and_consensus(file = ls[i], cutoff = 0.1)
}
length(consensus_per_cluster_liste)
consensus_per_cluster_aln = unlist(consensus_per_cluster_liste)
consensus_per_cluster_aln = do.call(c, consensus_per_cluster_aln)
# remove gaps
consensus_per_cluster_aln=RemoveGaps(consensus_per_cluster_aln)
# align
consensus_per_cluster_aln = AlignSeqs(myXStringSet = consensus_per_cluster_aln)
#######################################
# write alignments
#######################################
writeXStringSet(consensus_per_cluster_aln, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/consensus_clusters.aln")
# looking at the alignement, it is clear that some sequences have artifacts (stretches of C):
to_remove=read.table("consensus_bad_sequences_to_remove2.txt")
# filter out these ones
consensus_per_cluster_aln=consensus_per_cluster_aln[-which(names(consensus_per_cluster_aln) %in% to_remove$V1)]
writeXStringSet(consensus_per_cluster_aln, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/consensus_clusters.aln")
each representative sequence was then blasted against BOLD database, on the webserver http://www.boldsystems.org/index.php/IDS_IdentificationRequest/view?jobId=varaldi.identificationRequest_varaldi_CBC5D071-B542-4CF3-905C-6937D69AC7E9.H:bold_jobserver-vm:15440606#54_Pachy_c1_n=5
bold=read.table("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/IDEngine_Results_Summary.txt", h=T, sep="\t")
dim(bold)
# remove bad sequences
bold=bold[-which(bold$Query.ID %in% to_remove$V1),]
dim(bold)
head(bold)
# split first column
names=strsplit(as.character(bold$Query.ID), "_")
names=lapply(X = names, FUN=function(x){
sample=x[1]
species=x[2]
cluster=as.numeric(sub(pattern = "c", replacement = "", x = x[3]))
nreads=as.numeric(sub(pattern = "n=", replacement = "", x = x[4]))
res=data.frame(sample, species, cluster, nreads)
return(res)
})
names=do.call(rbind.data.frame, names)
bold=data.frame(names, bold)
head(bold)
#species identified by BOLD:
levels(bold$Best.ID)
Some reads correspond to unspecific amplification of wolbachia endosymbiont, and a few have no hits. They are removed.
wolbachia_samples=bold[(grep(pattern = "Wolbachia", x = bold$Best.ID)),]$Query.ID
no_match_samples=bold[(grep(pattern = "No match", x = bold$Best.ID)),]$Query.ID
bold=bold[-which(bold$Query.ID %in% union(wolbachia_samples,no_match_samples)),]
bold$Best.ID=factor(bold$Best.ID)
dim(bold)
levels(bold$Best.ID)
Plot raw read composition for each sample
liste=split(bold, f = bold$sample)
tab_per_sample=lapply(X = liste, FUN = function(x){
res=tapply(X = x$nreads, INDEX = x$Best.ID, FUN = sum)
res[is.na(res)]=0
res=t(as.data.frame(res))
sample_name=paste("sample", strsplit(as.character(x[1,5]), "_")[[1]][1], strsplit(as.character(x[1,5]), "_")[[1]][2])
res=data.frame(sample_name, res)
return(res)
})
table_per_sample=do.call(rbind.data.frame, tab_per_sample)
head(table_per_sample)
pdf(file = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/BOLD_assignment.pdf")
par(mar=c(12,4,2,2))
for (i in 1:dim(table_per_sample)[1]){
barplot(as.matrix(table_per_sample[i,-1]), las=3, main=table_per_sample$sample_name[i], ylab = "# reads")
}
dev.off()
library("RColorBrewer")
mat=as.matrix(table_per_sample[,-1])
rownames(mat)=table_per_sample$sample_name
heatmap(mat, cexRow=0.6, margins=c(9,7),
col=brewer.pal(9,"Blues"))
The idea is to first perform a general blast using a large db (containing huge diversity, but possibly containing mis-labelled sequences). Then select the list of species found in the first 10 hits grabed from all individual read.
Using BOLD database (BINs)
Download BOLD bins corresponding to Pteromalidae Alysiinae Diapridae Figitidae drosophilinae
cd ~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/
cat Pteromalidae.fasta Alysiinae.fasta Diapridae.fasta Figitidae.fasta drosophilinae.fasta > CO1_all.fasta
remove some sequences with uninformative names
library(Biostrings)
seq=readDNAStringSet(filepath = "~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/CO1_all.fasta")
# sequences in the CO1 database
length(seq)
# get ID and species names
names_seq=names(seq)
names_seq2=strsplit(x = names_seq, split="|", fixed=T)
names_seq_id=unlist(lapply(names_seq2, FUN=function(x){return(x[1])}))
names_seq_species=(lapply(names_seq2, FUN=function(x){return(x[2])}))
# identify the good species identifications
names_seq_species_OK=unlist(lapply(names_seq_species, FUN = function(x){
cut=strsplit(x = x, split=" ")
n_words=length(cut[[1]])
if (n_words==2 & cut[[1]][2]!="sp.") {
res=1
}
else {
res=0
}
return (res)
}))
names_seq_species=unlist(names_seq_species)
full_name=paste(names_seq_species, names_seq_id, sep = " ")
full_name=sub(pattern = " ", replacement = "_", x = full_name)
tab=data.frame(names_seq_id,names_seq_species, names_seq_species_OK, full_name)
dim(tab)
head(tab)
# subset sequences
# modify seq names
names2=paste(tab$names_seq_species, tab$names_seq_id, sep = " ")
names2=sub(pattern = " ", replacement = "_", x = names2)
names(seq)=names2
seq2=seq[which(names(seq) %in% tab[tab$names_seq_species_OK==1,]$full_name)]
length(seq2)
writeXStringSet(x = seq2, filepath = "~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/CO1_all_clean.fasta", format = "fasta")
Construct the bd
cd ~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/
makeblastdb -in CO1_all_clean.fasta -dbtype 'nucl' -out CO1_all
Run the blasts
cd /Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET
DB=~/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db/CO1_all
for file in *_short.fa; do
echo $file
blastn -num_threads 20 -query $file -outfmt 6 -max_target_seqs 10 -db $DB -out ../BLASTS/$file.blastn.tab
cut -f2 ../BLASTS/$file.blastn.tab > ../BLASTS/$file.species.txt
done
# get the whole set of species found
cat *.txt > all_species.txt
Analyse the set of species
Construct a table with individual read assignement.
setwd("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BLASTS/")
tab=read.table("all_species.txt", h=F)
head(tab)
dim(tab)
# number of "species"
length(unique(tab$V1))
# construct a table with sample info and the identified species for each assembled read:
ls=list.files(path = ".", pattern = "*.fa.blastn.tab")
ls=as.list(ls)
liste_res=lapply(X = ls, FUN = function(x){
# etract sample informations
sample_name=sub(x = x, pattern = ".fasta.blastn.tab", replacement = "")
sample_name_split=lapply(strsplit(sample_name, split="_"), FUN=function(x){return(x[1:5])})[[1]]
blast_result=read.table(file = x)
blast_result_split=split(x=blast_result$V2, f=blast_result$V1)
# return one species per read (because blast identify 2 hsp per read, and generates 2 lines per read)
species=unlist(lapply(X = blast_result_split, FUN=function(x){
return(x[1])
}))
# n reads
n_reads=length(blast_result_split)
# create a table with sample info and species identified (based on blast):
tab1=data.frame(c(1:n_reads), as.data.frame(x = t(sample_name_split)), species)
names(tab1)=c("read_number", "sample", "species", "location", "year", "month", "best_blast_species")
return(tab1)
})
table=do.call(rbind.data.frame, liste_res)
head(table)
# load the DECIPHER library in R
library(DECIPHER)
setwd("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated")
species_under_score = levels(table$best_blast_species)
species_space= sub(pattern = "_", replacement = " ", x = species_under_score)
n = 10
for (i in 1:length(species_space)){
print (paste0("SAMPLE : ",i," ",species_under_score[i]))
# download n accession numbers for each species
expression = paste0("cytochrome oxidase [TI] NOT 5prime[TI] NOT COII[TI] AND 500:800[SLEN] AND ", species_space[i] ," [Organism]")
command1=paste0("/anaconda2/bin/python ~/python_functions/download_accession_numbers_from_ncbi.py nucleotide \"", expression, "\" ", n, " ", species_under_score[i], ".an.txt")
system(command1)
# download the corresponding sequences
expression=paste0("/anaconda2/bin/python ~/python_functions/download_sequ_from_ncbi.py ", species_under_score[i], ".an.txt", " 20 fasta ", species_under_score[i], ".an.fa")
system(expression)
# align the sequences for each species
file = paste0(species_under_score[i], ".an.fa")
file_out = paste0(species_under_score[i], ".an.aln")
# load the sequences from the file
seqs <- readDNAStringSet(file)
seqs = RemoveGaps(myXStringSet = seqs)
if (length(seqs)>1){
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)
# perform the alignment
aligned <- AlignSeqs(seqs)
# write alignment
# write the alignment to a new FASTA file
writeXStringSet(aligned,
file=file_out)
}
}
Merge files and add a few sequences from BOLD (lacking from ncbi):
cd /Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/
cat *.an.fa > all.fa
Modify names and remove the incorrect sequences
db2=readBStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all.fa")
length(db2)
# remove incorrect sequences
accessions=unlist(lapply(strsplit(names(db2), " "), FUN = function(x){
return(x[1])
}))
#=db2[-which(x = accessions %in% to_remove)]
length(db2)
# shorten names
names=names(db2)
names[1:10]
names_split=strsplit(names, split=" ")
new_names=lapply(names_split, FUN = function(x){
res=paste0(x[2], "_" , x[3], "_", x[1])
return(res)
})
# remove unidentified species
new_names=unlist(new_names)
names(db2)=new_names
db2_clean=db2[-grep(pattern = "sp.", names(db2))]
db2_clean=db2_clean[-grep(pattern = "UNVERIFIED", names(db2_clean))]
length(db2_clean)
writeXStringSet(x = db2_clean, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean.fa")
Orientation check for all sequences and new alignment
# load the DECIPHER library in R
library(DECIPHER)
# specify the path to the FASTA file (in quotes)
fas <- "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean.fa"
# load the sequences from the file
# change "DNA" to "RNA" or "AA" if necessary
seqs <- readDNAStringSet(fas)
# look at some of the sequences (optional)
seqs
seqs = RemoveGaps(myXStringSet = seqs)
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)
# perform the alignment
aligned <- AlignSeqs(seqs)
# view the alignment in a browser (optional)
#BrowseSeqs(aligned, highlight=0)
# write the alignment to a new FASTA file
writeXStringSet(aligned,
file="/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean.aln")
A few are not CO1 sequences and should be removed and replaced by correct sequences (subobscura and obscura):
setwd("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/")
to_remove=read.table("to_remove.txt", h=F)
to_remove$V1
aligned2=aligned[-which(names(aligned) %in% to_remove$V1)]
# remove gaps
seqs = RemoveGaps(myXStringSet = aligned2)
to_add=readDNAStringSet("lacking.fas")
# only those called COI-5P are OK
to_add=to_add[grep(pattern = "COI-5P", x = names(to_add))]
seqs_to_Add = RemoveGaps(myXStringSet = to_add)
names=names(seqs_to_Add)
names2=lapply(strsplit(names, "|", fixed=T), FUN=function(x){
sp=sub(pattern = " ", replacement = "_", x = x[2])
an=x[4]
res=paste0(sp, "_BOLD", an)
return(res)
})
names(seqs_to_Add)=names2
# add to the previous
seqs=c(seqs, seqs_to_Add)
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)
# perform the alignment
aligned2 <- AlignSeqs(seqs)
# write the alignment to a new FASTA file
writeXStringSet(aligned2,
file="/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean2.aln")
A few species were filtered out (D. phalerata and Pachychrepoideus D supulchrella, and D. kuntzei):
to_add=readDNAStringSet(filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/to_add_again.fa")
to_add2=readDNAStringSet(filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/D.subpulchrella_toadd.fa")
aligned3=c(aligned2, to_add, to_add2)
# remove gaps
seqs = RemoveGaps(myXStringSet = aligned3)
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)
# perform the alignment
aligned3 <- AlignSeqs(seqs)
# view the alignment in a browser (optional)
#BrowseSeqs(aligned3, highlight=0)
# write the alignment to a new FASTA file
writeXStringSet(aligned3,
file="/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean3.aln")
aligned3=readDNAStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean3.aln")
# calculate a NJ tree
d <- DistanceMatrix(aligned3,
correction="Jukes-Cantor")
dend <- IdClusters(d,
method="NJ",
type="dendrogram",
myXStringSet=aligned3, showPlot = T)
dend %>%
# Custom branches
set("branches_col", "grey") %>% set("branches_lwd", 1) %>%
# Custom labels
set("labels_col", "black") %>% set("labels_cex", 0.1) %>%
plot(horiz=T)
dev.print(device = pdf, "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/Phylogeny_db.pdf")
aligned3=readDNAStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean3.aln")
consensus_per_cluster_aln2=readDNAStringSet("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/consensus_clusters.aln")
# remove the Wolbachia hits
consensus_per_cluster_aln2=consensus_per_cluster_aln2[-which(names(consensus_per_cluster_aln2) %in% wolbachia_samples)]
######################
# n sequences in the db
length(aligned3)
# n sequences (consensus per cluster) in our sample
length(consensus_per_cluster_aln2)
full_aln=c(aligned3, reverseComplement(consensus_per_cluster_aln2))
length(full_aln)
#remove gaps
full_aln=RemoveGaps(full_aln)
# change orientation of our samples
#full_aln=OrientNucleotides(full_aln)
# align
full_aln=AlignSeqs(myXStringSet = full_aln)
#BrowseSeqs(full_aln)
# write alignment
writeXStringSet(full_aln, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean+ours.aln")
construct a NJ phylogeny
# calculate a NJ tree
d <- DistanceMatrix(full_aln,
correction="Jukes-Cantor", processors = NULL, penalizeGapGapMatches=FALSE, penalizeGapLetterMatches=F)
dend <- IdClusters(d,
method="NJ",
type="dendrogram",
myXStringSet=full_aln, showPlot = T)
colours=rep("black", length(full_aln))
colours[grep(labels(dend), pattern = "*_*_c[0-9]_n*")]="red"
dend %>%
# Custom branches
set("branches_col", "grey") %>% set("branches_lwd", 1) %>%
# Custom labels
set("labels_col", colours) %>% set("labels_cex", 0.1) %>%
plot(horiz=T)
dev.print(device = pdf, "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/Phylogeny_db_plus_our_samples_NJ.pdf")
From this first phylogeny, we identified a few strange reads. To check what they are I blasted them against nt: - 44_At_c2_n=1 is a non specific hit = rikketsia sequence (remove) - 54_Pachy_c2_n=1 is a non specific hit (nematode or something like that) - 54_Pachy_c3_n=2 is L. boulardi (contamination?) - 45_Trichopria_c1_n=1 is a non specific hit (virus or something like that)
I remove them from the phylogeny (except 54_Pachy_c3_n=2)
to_remove2=c("44_A.t_c2_n=142", "54_Pachy_c2_n=1", "45_Trichopria_c1_n=1")
full_aln2=full_aln[-which(names(full_aln) %in% to_remove2)]
# remove gaps
seqs = RemoveGaps(myXStringSet = full_aln2)
# nucleotide sequences need to be in the same orientation
# if they are not, then they can be reoriented (optional)
seqs = OrientNucleotides(seqs)
# perform the alignment
full_aln2 <- AlignSeqs(seqs)
# write it to disk
writeXStringSet(full_aln2, filepath = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/FINAL_DATASET/all_clean+ours_clean.aln")
# calculate a NJ tree
d <- DistanceMatrix(full_aln2,
correction="Jukes-Cantor", processors = NULL,
penalizeGapGapMatches=FALSE,
penalizeGapLetterMatches=F)
dend <- IdClusters(d,
method="ML",
type="dendrogram",
myXStringSet=full_aln2, showPlot = T, processors = NULL)
colours=rep("black", length(full_aln2))
colours[grep(labels(dend), pattern = "*_*_c[0-9]_n*")]="red"
dend %>%
# Custom branches
set("branches_col", "grey") %>% set("branches_lwd", 1) %>%
# Custom labels
set("labels_col", colours) %>% set("labels_cex", 0.1) %>%
plot(horiz=T)
dev.print(device = pdf, "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/Phylogeny_db_plus_our_samples_ML.pdf")
I also ran a phylogenetic reconstruction using seaview on the same alignement (all_clean+ours_clean.aln) to obtain aLRT support for branches.
Now based on the phylogeny, I modified the assignations obtained from BOLD in the IDEngine_Results_Summary_corrected.txt table.
bold_modified=read.table("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/IDEngine_Results_Summary_corrected.txt", h=T , sep="\t")
head(bold_modified)
## Query.ID Best.ID Search.DB Top..
## 1 1_D.mel_c1_n=3031 Drosophila melanogaster COI SPECIES DATABASE 99.81
## 2 10_L.b_c1_n=1 Leptopilina boulardi COI SPECIES DATABASE 99.21
## 3 10_L.b_c2_n=1 Leptopilina boulardi ncbi NA
## 4 10_L.b_c3_n=944 Leptopilina boulardi COI SPECIES DATABASE 99.81
## 5 10_L.b_c4_n=401 Leptopilina boulardi COI SPECIES DATABASE 99.77
## 6 10_L.b_c5_n=1049 Leptopilina boulardi COI SPECIES DATABASE 99.81
## Low..
## 1 99.74
## 2 85.43
## 3 NA
## 4 85.92
## 5 85.90
## 6 86.02
# split first column
names=strsplit(as.character(bold_modified$Query.ID), "_")
names=lapply(X = names, FUN=function(x){
sample=x[1]
species=x[2]
cluster=as.numeric(sub(pattern = "c", replacement = "", x = x[3]))
nreads=as.numeric(sub(pattern = "n=", replacement = "", x = x[4]))
res=data.frame(sample, species, cluster, nreads)
return(res)
})
names=do.call(rbind.data.frame, names)
bold_modified=data.frame(names, bold_modified)
head(bold_modified)
## sample species cluster nreads Query.ID Best.ID
## 1 1 D.mel 1 3031 1_D.mel_c1_n=3031 Drosophila melanogaster
## 2 10 L.b 1 1 10_L.b_c1_n=1 Leptopilina boulardi
## 3 10 L.b 2 1 10_L.b_c2_n=1 Leptopilina boulardi
## 4 10 L.b 3 944 10_L.b_c3_n=944 Leptopilina boulardi
## 5 10 L.b 4 401 10_L.b_c4_n=401 Leptopilina boulardi
## 6 10 L.b 5 1049 10_L.b_c5_n=1049 Leptopilina boulardi
## Search.DB Top.. Low..
## 1 COI SPECIES DATABASE 99.81 99.74
## 2 COI SPECIES DATABASE 99.21 85.43
## 3 ncbi NA NA
## 4 COI SPECIES DATABASE 99.81 85.92
## 5 COI SPECIES DATABASE 99.77 85.90
## 6 COI SPECIES DATABASE 99.81 86.02
#species identified by BOLD:
levels(bold_modified$Best.ID)
## [1] "Asobara rufescens"
## [2] "Asobara tabida"
## [3] "Chymomyza amoena"
## [4] "Drosophila hydei"
## [5] "Drosophila immigrans"
## [6] "Drosophila kuntzei"
## [7] "Drosophila melanogaster"
## [8] "Drosophila obscura"
## [9] "Drosophila simulans"
## [10] "Drosophila subobscura"
## [11] "Drosophila suzukii"
## [12] "Leptopilina boulardi"
## [13] "Leptopilina heterotoma"
## [14] "not specific (nematode?)"
## [15] "not specific (Rickettsia)"
## [16] "not specific (virus)"
## [17] "Pteromalidae sp."
## [18] "Trichopria sp. 1 TAS-2012"
## [19] "Wolbachia endosymbiont of Blastophaga sp. 1"
## [20] "Wolbachia endosymbiont of Cerapachys undet"
## [21] "Wolbachia endosymbiont of Microgastrinae"
## [22] "Wolbachia sp. Cerapachys L_MG10_ASANV577-09"
Remove non specific amplifications (symbionts and others):
wolbachia_samples=bold_modified[(grep(pattern = "Wolbachia", x = bold_modified$Best.ID)),]$Query.ID
not_specific=bold_modified[(grep(pattern = "not specific", x = bold_modified$Best.ID)),]$Query.ID
bold_modified=bold_modified[-which(bold_modified$Query.ID %in% union(wolbachia_samples,not_specific)),]
bold_modified$Best.ID=factor(bold_modified$Best.ID)
dim(bold_modified)
## [1] 101 9
levels(bold_modified$Best.ID)
## [1] "Asobara rufescens" "Asobara tabida"
## [3] "Chymomyza amoena" "Drosophila hydei"
## [5] "Drosophila immigrans" "Drosophila kuntzei"
## [7] "Drosophila melanogaster" "Drosophila obscura"
## [9] "Drosophila simulans" "Drosophila subobscura"
## [11] "Drosophila suzukii" "Leptopilina boulardi"
## [13] "Leptopilina heterotoma" "Pteromalidae sp."
## [15] "Trichopria sp. 1 TAS-2012"
Plot read composition for each sample :
liste=split(bold_modified, f = bold_modified$sample)
tab_per_sample=lapply(X = liste, FUN = function(x){
res=tapply(X = x$nreads, INDEX = x$Best.ID, FUN = sum)
res[is.na(res)]=0
res=t(as.data.frame(res))
sample_name=paste("sample", strsplit(as.character(x[1,5]), "_")[[1]][1], strsplit(as.character(x[1,5]), "_")[[1]][2])
res=data.frame(sample_name, res)
return(res)
})
table_per_sample=do.call(rbind.data.frame, tab_per_sample)
head(table_per_sample)
## sample_name Asobara.rufescens Asobara.tabida Chymomyza.amoena
## 1 sample 1 D.mel 0 0 0
## 10 sample 10 L.b 0 0 0
## 11 sample 11 D.mel 0 0 0
## 12 sample 12 D.sim 0 0 0
## 13 sample 13 D.im 0 0 0
## 14 sample 14 D.sub 0 0 0
## Drosophila.hydei Drosophila.immigrans Drosophila.kuntzei
## 1 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 1594 0
## 14 0 0 0
## Drosophila.melanogaster Drosophila.obscura Drosophila.simulans
## 1 3031 0 0
## 10 0 0 0
## 11 1606 0 0
## 12 0 0 628
## 13 0 0 0
## 14 0 141 0
## Drosophila.subobscura Drosophila.suzukii Leptopilina.boulardi
## 1 0 0 0
## 10 0 0 2397
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 166 0 0
## Leptopilina.heterotoma Pteromalidae.sp. Trichopria.sp..1.TAS.2012
## 1 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 0 0 0
pdf(file = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/BOLD_assignment_corrected.pdf")
par(mar=c(12,4,2,2))
for (i in 1:dim(table_per_sample)[1]){
barplot(as.matrix(table_per_sample[i,-1]), las=3, main=table_per_sample$sample_name[i], ylab = "# reads")
}
dev.off()
## quartz_off_screen
## 2
for (i in 1:dim(table_per_sample)[1]){
barplot(as.matrix(table_per_sample[i,-1]), las=3, main=table_per_sample$sample_name[i], ylab = "# reads")
}
library("RColorBrewer")
mat=as.matrix(table_per_sample[,-1])
rownames(mat)=table_per_sample$sample_name
# reorder rows and columns
sort=unlist(lapply(strsplit(rownames(mat), " "), FUN=function(x){
return(x[3])
}))
d=data.frame(1:length(sort), sort)
names(d)[1]="index"
d=d[order(d$sort, decreasing = T),]
mat=mat[d$index,sort(colnames(mat))]
heatmap(mat, cexRow=0.6, margins=c(9,7),
col=brewer.pal(9,"Blues"), Rowv=NA, Colv = NA)
# write matrix to the disk
write.table(mat, file = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/Assignment_table.txt", row.names = T, col.names = T, quote=F)
Finally, the phylogeny with branch support (alignement by muscle, PhyML as implemented in seaview) is constructed and imported :
library(ggtree)
## Warning: package 'ggtree' was built under R version 3.5.2
## ggtree v1.14.6 For help: https://guangchuangyu.github.io/software/ggtree
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## - Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
##
## - Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, accepted. doi: 10.1093/molbev/msy194
##
## Attaching package: 'ggtree'
## The following object is masked _by_ '.GlobalEnv':
##
## multiplot
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
## Loading required package: ggplot2
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
## The following object is masked from 'package:ggtree':
##
## ggsave
tree=read.tree("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean+ours_clean-PhyML_tree")
tip=tree$tip.label
colour_tips=rep("blue", length(tree$tip.label))
colour_tips[grep(x = colour_tips, pattern = "*_*_c[0-9]_n*")]="red"
dd=data.frame(tree$tip.label, colour_tips)
aLRT=tree$node.label
aLRT[which(aLRT<0.7)]=""
p2 =ggplot(tree, aes(x, y)) + geom_tree(layout = "rectangular") +
theme_tree() +
geom_treescale() +
geom_tiplab(size = 0.4, aes(colour=rep("red", 871))) +
geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.4, hjust=0)
p2
ggsave(p2,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML.pdf")
## Saving 7 x 5 in image
### without the small sequence
# see https://github.com/GuangchuangYu/plotTree-ggtree/blob/master/01_basic_strain_info.R
tree=read.tree("/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/CO1_db_curated/all_clean+ours_clean-PhyML_tree2")
tip=tree$tip.label
colour_tips=rep("blue", length(tree$tip.label))
colour_tips[grep(x = tip, pattern = "*_*_c[0-9]_n*")]="red"
dd=data.frame(tree$tip.label, colour_tips)
rownames(dd)=tree$tip.label
aLRT=tree$node.label
aLRT[which(aLRT<0.7)]=""
# show node numbers
p <- ggtree(tree, color="grey",linetype="dotted") %<+% dd +
geom_tree(layout = "rectangular") +
theme_tree() +
geom_treescale(x = 0, y = 400) +
geom_tiplab(aes(color=colour_tips), size=0.8) +
#geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.2, hjust=-0.4) +
geom_text2(aes(subset=!isTip, label=node), hjust=-0.3, size=0.6)
p
ggsave(p,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML2_node_numbers.pdf", width = 10, height = 15)
# highlight the clades that are of interests:
# example Chimomiza node 686
viewClade(p+geom_tiplab(), node=462)
# our clades of interest :
species_highlight=c(686, # chymomiza
838, #D. mel
859, # D.s im
818, # D. suz
806, # D. hydei
794, # D. kuntzei
630, # D. sub
625, # D. obscura
611, # D. im
573, # A.r
566, # A.t
540, #PAchy
485, # Tricho
437, #L. he
462) #L.b
#species_highlight=c(686, 831, 859, 806, 794, 624, 611, 640, 566, 573, 540, 462, 437, 485, 817)
tree2 = groupClade(tree, .node = species_highlight)
#ggtree(tree, aes(color=group, linetype=group))
p <- ggtree(tree2, aes(color=group)) %<+% dd +
geom_tree(layout = "rectangular") +
theme_tree() +
geom_treescale(x = 0, y = 400)+
geom_tiplab(aes(color=colour_tips), size=0.8) + scale_color_manual(values = c("grey", rep("black", 14), "blue", "red" )) +
geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.6, hjust=-0.4)
p
ggsave(p,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML2a.pdf", width = 10, height = 15)
alpha=0.2
extend=0.05
plot_grid(p <- ggtree(tree, color="grey") %<+% dd +
geom_tree(layout = "rectangular") +
theme_tree() +
geom_treescale(x = 0, y = 400, offset = 2) +
geom_tiplab(aes(color=colour_tips), size=0.8) + scale_color_manual(values = c("gray45", "firebrick1")) +
geom_text2(aes(subset=!isTip, label=c(rep(0, length(tree$tip.label)), aLRT)), size=0.6, hjust=-0.4) +
geom_hilight(node = species_highlight[1], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[2], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[3], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[4], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[5], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[6], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[7], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[8], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[9], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[10], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[11], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[12], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[13], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[14], "steelblue", extend = extend, alpha = alpha) +
geom_hilight(node = species_highlight[15], "steelblue", extend = extend, alpha = alpha))
ggsave(p,device = "pdf", filename = "/Users/julienvaraldi/Documents/Manips/Viromics_Metagenomics/CO1/BOLD_identification/PhyML2b.pdf", width = 10, height = 15)